Raw data

- GEO number: GSE128615

- Study summary: To determine the influence of differential Kdm6a expression in immune cells, whole transcriptome analysis for CD4+ T cells from WT and Kdm6a cKO mice were performed using RNA-Seq.

Loading packages

- DESeq2: http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html

library(data.table)
library(tidyverse)
library(rmarkdown)
library(ggplot2)
library(pheatmap)
library(AnnotationHub)
library(tximport)
library(Rsubread)
library(DESeq2)
library(UpSetR)

Setting AnnotationHub

Assign your species of interest

AnnotationSpecies <- "mus musculus"  # Assign your species 
ah <- AnnotationHub(hub=getAnnotationHubOption("URL"))  # Bring annotation DB

Running AnnotationHub

ahQuery <- query(ah, c("OrgDb", AnnotationSpecies))      # Filter annotation of interest
if (length(ahQuery) == 1) {
    DBName <- names(ahQuery)
} else if (length(ahQuery) > 1) {
               DBName <- names(ahQuery)[1]
} else {
    print("You don't have a valid DB")
    rmarkdown::render() 
} 

AnnoDb <- ah[[DBName]] # Store into an OrgDb object  
# Explore your OrgDb object with following accessors:
# columns(AnnpDb)
# keytypes(AnnoDb)
# keys(AnnoDb, keytype=..)
# select(AnnoDb, keys=.., columns=.., keytype=...)
AnnoKey <- keys(AnnoDb, keytype="ENSEMBLTRANS")
# Note: Annotation has to be done with not genome but transcripts 
AnnoDb <- select(AnnoDb, 
                 AnnoKey,
                 keytype="ENSEMBLTRANS",
                 columns="ENSEMBL")

# Check if your AnnoDb has been extracted and saved correctely
class(AnnoDb)
## [1] "data.frame"
head(AnnoDb)    # The column 1 has to assign transcript (e.g. ENSEMBLTRANS)
##         ENSEMBLTRANS            ENSEMBL
## 1 ENSMUST00000030010 ENSMUSG00000015243
## 2 ENSMUST00000149127 ENSMUSG00000015243
## 3 ENSMUST00000033695 ENSMUSG00000031333
## 4 ENSMUST00000140333 ENSMUSG00000031333
## 5 ENSMUST00000236391 ENSMUSG00000024030
## 6 ENSMUST00000024829 ENSMUSG00000024030

Metadata setting

# This code chunk needs to be written by yourself 

# Define sample names 
SampleNames <- c(paste0("WT-rep", 1:3), paste0("cKO-rep", 1:3))  

# Aligner names
Aligners <- c("Salmon", "STAR", "HISAT2")

# Define group level
GroupLevel <- c("WT", "cKO")

# Define contrast for DE analysis
Contrast <- c("Group", GroupLevel)

# Set a function for file paths
path.fn <- function(head, tail) { 

    vec <- c(paste0(head,    # head = e.g. "hisat2", "star", or "salmon"
                    "_output/",
                    SampleNames,
                    tail))   # tail = file name after SampleNames 

    return(vec)
}


# Define .sf file path
sf <- path.fn("salmon",
              ".salmon_quant/quant.sf")


# Define STAR file path
star <- path.fn("star", 
                "Aligned.sortedByCoord.out.bam")

# Define HISAT2 file path
hisat <- path.fn("hisat2",
                 ".sorted.bam")


# Define sample groups
group <- c(rep(GroupLevel[1], 3), rep(GroupLevel[2], 3))

# Create metadata
metadata <- data.frame(Sample=factor(SampleNames, levels=SampleNames),
                       Group=factor(group, levels=GroupLevel),
                       Salmon_path=sf,
                       STAR_path=star, 
                       HISAT2_path=hisat)

# Assign row names with sample names
rownames(metadata) <- SampleNames


# Explore the metadata
print(metadata)
##            Sample Group                                  Salmon_path
## WT-rep1   WT-rep1    WT  salmon_output/WT-rep1.salmon_quant/quant.sf
## WT-rep2   WT-rep2    WT  salmon_output/WT-rep2.salmon_quant/quant.sf
## WT-rep3   WT-rep3    WT  salmon_output/WT-rep3.salmon_quant/quant.sf
## cKO-rep1 cKO-rep1   cKO salmon_output/cKO-rep1.salmon_quant/quant.sf
## cKO-rep2 cKO-rep2   cKO salmon_output/cKO-rep2.salmon_quant/quant.sf
## cKO-rep3 cKO-rep3   cKO salmon_output/cKO-rep3.salmon_quant/quant.sf
##                                                  STAR_path
## WT-rep1   star_output/WT-rep1Aligned.sortedByCoord.out.bam
## WT-rep2   star_output/WT-rep2Aligned.sortedByCoord.out.bam
## WT-rep3   star_output/WT-rep3Aligned.sortedByCoord.out.bam
## cKO-rep1 star_output/cKO-rep1Aligned.sortedByCoord.out.bam
## cKO-rep2 star_output/cKO-rep2Aligned.sortedByCoord.out.bam
## cKO-rep3 star_output/cKO-rep3Aligned.sortedByCoord.out.bam
##                                HISAT2_path
## WT-rep1   hisat2_output/WT-rep1.sorted.bam
## WT-rep2   hisat2_output/WT-rep2.sorted.bam
## WT-rep3   hisat2_output/WT-rep3.sorted.bam
## cKO-rep1 hisat2_output/cKO-rep1.sorted.bam
## cKO-rep2 hisat2_output/cKO-rep2.sorted.bam
## cKO-rep3 hisat2_output/cKO-rep3.sorted.bam

featureCounts parameter setting

# "mm10", "mm9", "hg38", or "hg19"
# cf. mm10 = GRCm38
annot.inbuilt <- "mm10" 

# GTF file path
annot.ext <- "../mouse_reference/gencode.m25.primary_assembly.annotation.gtf"

# annotation type:
# e.g.: "gene_id", "transcript_id", or "gene_name"
GTF.attrType <- "gene_id"

# Number of cores 
nthread <- 16 

# Set a function importing counts from BAM files with featureCounts()
fcounts.fn <- function(vec) {

    fc <- featureCounts(files=vec,   # a vector assigning BAM file paths
                         annot.inbuilt=annot.inbuilt,
                         annot.ext=annot.ext,
                         GTF.attrType=GTF.attrType,
                         isGTFAnnotationFile=T,
                         nthread=nthread, 
                         isPairedEnd=F, # Set this parameter correctly 
                         verbose=T)

    return(fc$counts)
}

Importing counts

Importing Salmon counts

Note:

txi1 <- tximport(…, txOut=F)

txi2 <- tximport(…, txOut=T)

txi2 <- summarizedToGene(…)

counts extracted from txi1 and txi2 are the same

# Import gene level summarized counts 
salmon.txi <- tximport(metadata$Salmon_path,
                       type = "salmon",
                       tx2gene=AnnoDb,
                       ignoreTxVersion=T, 
                       txOut=F)       # TRUE for transcript level, FALSE for gene level 

# Extract the counts and save as a data frame
salmon.counts <- salmon.txi$counts

# Explore the salmon count data frame
head(salmon.counts)
##                    [,1]     [,2] [,3]     [,4]     [,5]     [,6]
## ENSMUSG00000000001 6492 5822.000 5614 6040.000 6142.000 5437.000
## ENSMUSG00000000056 1637 1458.000 1395 1579.000 1608.999 1398.000
## ENSMUSG00000000058    5    5.000    4    3.000    1.000    3.000
## ENSMUSG00000000078 3344 2807.000 2828 3589.000 3731.000 2912.000
## ENSMUSG00000000088 4558 4149.929 3958 4189.921 4151.000 3623.928
## ENSMUSG00000000093   10    8.000    8    7.000    4.000    4.000
dim(salmon.counts)
## [1] 15022     6
summary(salmon.counts)
##        V1                 V2                 V3                 V4          
##  Min.   :     0.0   Min.   :     0.0   Min.   :     0.0   Min.   :     0.0  
##  1st Qu.:     0.0   1st Qu.:     0.0   1st Qu.:     0.0   1st Qu.:     0.0  
##  Median :     0.0   Median :     0.0   Median :     0.0   Median :     0.0  
##  Mean   :   675.0   Mean   :   603.9   Mean   :   585.8   Mean   :   613.8  
##  3rd Qu.:    29.3   3rd Qu.:    27.0   3rd Qu.:    25.0   3rd Qu.:    28.0  
##  Max.   :357534.3   Max.   :322612.0   Max.   :318293.5   Max.   :343971.6  
##        V5                 V6           
##  Min.   :     0.0   Min.   :     0.00  
##  1st Qu.:     0.0   1st Qu.:     0.00  
##  Median :     0.0   Median :     0.00  
##  Mean   :   618.5   Mean   :   531.85  
##  3rd Qu.:    27.0   3rd Qu.:    22.94  
##  Max.   :337557.2   Max.   :305036.86

Importing STAR counts

# Extract counts by running featureCounts() 
star.counts <- fcounts.fn(metadata$STAR_path)
## 
##         ==========     _____ _    _ ____  _____  ______          _____  
##         =====         / ____| |  | |  _ \|  __ \|  ____|   /\   |  __ \ 
##           =====      | (___ | |  | | |_) | |__) | |__     /  \  | |  | |
##             ====      \___ \| |  | |  _ <|  _  /|  __|   / /\ \ | |  | |
##               ====    ____) | |__| | |_) | | \ \| |____ / ____ \| |__| |
##         ==========   |_____/ \____/|____/|_|  \_\______/_/    \_\_____/
##        Rsubread 2.4.0
## 
## //========================== featureCounts setting ===========================\\
## ||                                                                            ||
## ||             Input files : 6 BAM files                                      ||
## ||                                                                            ||
## ||                           WT-rep1Aligned.sortedByCoord.out.bam             ||
## ||                           WT-rep2Aligned.sortedByCoord.out.bam             ||
## ||                           WT-rep3Aligned.sortedByCoord.out.bam             ||
## ||                           cKO-rep1Aligned.sortedByCoord.out.bam            ||
## ||                           cKO-rep2Aligned.sortedByCoord.out.bam            ||
## ||                           cKO-rep3Aligned.sortedByCoord.out.bam            ||
## ||                                                                            ||
## ||              Paired-end : no                                               ||
## ||        Count read pairs : no                                               ||
## ||              Annotation : gencode.m25.primary_assembly.annotation.gtf  ... ||
## ||      Dir for temp files : .                                                ||
## ||                 Threads : 16                                               ||
## ||                   Level : meta-feature level                               ||
## ||      Multimapping reads : counted                                          ||
## || Multi-overlapping reads : not counted                                      ||
## ||   Min overlapping bases : 1                                                ||
## ||                                                                            ||
## \\============================================================================//
## 
## //================================= Running ==================================\\
## ||                                                                            ||
## || Load annotation file gencode.m25.primary_assembly.annotation.gtf ...       ||
## ||    Features : 843712                                                       ||
## ||    Meta-features : 55487                                                   ||
## ||    Chromosomes/contigs : 45                                                ||
## ||                                                                            ||
## || Process BAM file WT-rep1Aligned.sortedByCoord.out.bam...                   ||
## ||                                                                            ||
## || Chromosomes/contigs in input file but not in annotation                    ||
## ||    GL456359.1                                                              ||
## ||    GL456393.1                                                              ||
## ||    GL456389.1                                                              ||
## ||    JH584300.1                                                              ||
## ||    GL456368.1                                                              ||
## ||    GL456394.1                                                              ||
## ||    GL456360.1                                                              ||
## ||    JH584301.1                                                              ||
## ||    GL456390.1                                                              ||
## ||    GL456213.1                                                              ||
## ||    GL456382.1                                                              ||
## ||    GL456378.1                                                              ||
## ||    JH584302.1                                                              ||
## ||    GL456387.1                                                              ||
## ||    GL456370.1                                                              ||
## ||    GL456366.1                                                              ||
## ||    GL456383.1                                                              ||
## ||    GL456379.1                                                              ||
## ||    GL456396.1                                                              ||
## ||    GL456392.1                                                              ||
## ||    GL456367.1                                                              ||
## ||                                                                            ||
## ||    Single-end reads are included.                                          ||
## ||    Total alignments : 70746461                                             ||
## ||    Successfully assigned alignments : 55979059 (79.1%)                     ||
## ||    Running time : 0.07 minutes                                             ||
## ||                                                                            ||
## || Process BAM file WT-rep2Aligned.sortedByCoord.out.bam...                   ||
## ||                                                                            ||
## || Chromosomes/contigs in input file but not in annotation                    ||
## ||    GL456359.1                                                              ||
## ||    GL456393.1                                                              ||
## ||    GL456389.1                                                              ||
## ||    JH584300.1                                                              ||
## ||    GL456368.1                                                              ||
## ||    GL456394.1                                                              ||
## ||    GL456360.1                                                              ||
## ||    JH584301.1                                                              ||
## ||    GL456390.1                                                              ||
## ||    GL456213.1                                                              ||
## ||    GL456382.1                                                              ||
## ||    GL456378.1                                                              ||
## ||    JH584302.1                                                              ||
## ||    GL456387.1                                                              ||
## ||    GL456370.1                                                              ||
## ||    GL456366.1                                                              ||
## ||    GL456383.1                                                              ||
## ||    GL456379.1                                                              ||
## ||    GL456396.1                                                              ||
## ||    GL456392.1                                                              ||
## ||    GL456367.1                                                              ||
## ||                                                                            ||
## ||    Single-end reads are included.                                          ||
## ||    Total alignments : 63394034                                             ||
## ||    Successfully assigned alignments : 50159175 (79.1%)                     ||
## ||    Running time : 0.06 minutes                                             ||
## ||                                                                            ||
## || Process BAM file WT-rep3Aligned.sortedByCoord.out.bam...                   ||
## ||                                                                            ||
## || Chromosomes/contigs in input file but not in annotation                    ||
## ||    GL456359.1                                                              ||
## ||    GL456393.1                                                              ||
## ||    GL456389.1                                                              ||
## ||    JH584300.1                                                              ||
## ||    GL456368.1                                                              ||
## ||    GL456394.1                                                              ||
## ||    GL456360.1                                                              ||
## ||    JH584301.1                                                              ||
## ||    GL456390.1                                                              ||
## ||    GL456213.1                                                              ||
## ||    GL456382.1                                                              ||
## ||    GL456378.1                                                              ||
## ||    JH584302.1                                                              ||
## ||    GL456387.1                                                              ||
## ||    GL456370.1                                                              ||
## ||    GL456366.1                                                              ||
## ||    GL456383.1                                                              ||
## ||    GL456379.1                                                              ||
## ||    GL456396.1                                                              ||
## ||    GL456392.1                                                              ||
## ||    GL456367.1                                                              ||
## ||                                                                            ||
## ||    Single-end reads are included.                                          ||
## ||    Total alignments : 61430033                                             ||
## ||    Successfully assigned alignments : 48522991 (79.0%)                     ||
## ||    Running time : 0.06 minutes                                             ||
## ||                                                                            ||
## || Process BAM file cKO-rep1Aligned.sortedByCoord.out.bam...                  ||
## ||                                                                            ||
## || Chromosomes/contigs in input file but not in annotation                    ||
## ||    GL456359.1                                                              ||
## ||    GL456393.1                                                              ||
## ||    GL456389.1                                                              ||
## ||    JH584300.1                                                              ||
## ||    GL456368.1                                                              ||
## ||    GL456394.1                                                              ||
## ||    GL456360.1                                                              ||
## ||    JH584301.1                                                              ||
## ||    GL456390.1                                                              ||
## ||    GL456213.1                                                              ||
## ||    GL456382.1                                                              ||
## ||    GL456378.1                                                              ||
## ||    JH584302.1                                                              ||
## ||    GL456387.1                                                              ||
## ||    GL456370.1                                                              ||
## ||    GL456366.1                                                              ||
## ||    GL456383.1                                                              ||
## ||    GL456379.1                                                              ||
## ||    GL456396.1                                                              ||
## ||    GL456392.1                                                              ||
## ||    GL456367.1                                                              ||
## ||                                                                            ||
## ||    Single-end reads are included.                                          ||
## ||    Total alignments : 64533984                                             ||
## ||    Successfully assigned alignments : 51163980 (79.3%)                     ||
## ||    Running time : 0.06 minutes                                             ||
## ||                                                                            ||
## || Process BAM file cKO-rep2Aligned.sortedByCoord.out.bam...                  ||
## ||                                                                            ||
## || Chromosomes/contigs in input file but not in annotation                    ||
## ||    GL456359.1                                                              ||
## ||    GL456393.1                                                              ||
## ||    GL456389.1                                                              ||
## ||    JH584300.1                                                              ||
## ||    GL456368.1                                                              ||
## ||    GL456394.1                                                              ||
## ||    GL456360.1                                                              ||
## ||    JH584301.1                                                              ||
## ||    GL456390.1                                                              ||
## ||    GL456213.1                                                              ||
## ||    GL456382.1                                                              ||
## ||    GL456378.1                                                              ||
## ||    JH584302.1                                                              ||
## ||    GL456387.1                                                              ||
## ||    GL456370.1                                                              ||
## ||    GL456366.1                                                              ||
## ||    GL456383.1                                                              ||
## ||    GL456379.1                                                              ||
## ||    GL456396.1                                                              ||
## ||    GL456392.1                                                              ||
## ||    GL456367.1                                                              ||
## ||                                                                            ||
## ||    Single-end reads are included.                                          ||
## ||    Total alignments : 65134349                                             ||
## ||    Successfully assigned alignments : 51790758 (79.5%)                     ||
## ||    Running time : 0.06 minutes                                             ||
## ||                                                                            ||
## || Process BAM file cKO-rep3Aligned.sortedByCoord.out.bam...                  ||
## ||                                                                            ||
## || Chromosomes/contigs in input file but not in annotation                    ||
## ||    GL456359.1                                                              ||
## ||    GL456393.1                                                              ||
## ||    GL456389.1                                                              ||
## ||    JH584300.1                                                              ||
## ||    GL456368.1                                                              ||
## ||    GL456394.1                                                              ||
## ||    GL456360.1                                                              ||
## ||    JH584301.1                                                              ||
## ||    GL456390.1                                                              ||
## ||    GL456213.1                                                              ||
## ||    GL456382.1                                                              ||
## ||    GL456378.1                                                              ||
## ||    JH584302.1                                                              ||
## ||    GL456387.1                                                              ||
## ||    GL456370.1                                                              ||
## ||    GL456366.1                                                              ||
## ||    GL456383.1                                                              ||
## ||    GL456379.1                                                              ||
## ||    GL456396.1                                                              ||
## ||    GL456392.1                                                              ||
## ||    GL456367.1                                                              ||
## ||                                                                            ||
## ||    Single-end reads are included.                                          ||
## ||    Total alignments : 56200709                                             ||
## ||    Successfully assigned alignments : 44128747 (78.5%)                     ||
## ||    Running time : 0.06 minutes                                             ||
## ||                                                                            ||
## || Write the final count table.                                               ||
## || Write the read assignment summary.                                         ||
## ||                                                                            ||
## \\============================================================================//
# Explore the STAR count data frame
head(star.counts)
##                      WT-rep1Aligned.sortedByCoord.out.bam
## ENSMUSG00000102693.1                                    0
## ENSMUSG00000064842.1                                    0
## ENSMUSG00000051951.5                                    0
## ENSMUSG00000102851.1                                    0
## ENSMUSG00000103377.1                                    1
## ENSMUSG00000104017.1                                    0
##                      WT-rep2Aligned.sortedByCoord.out.bam
## ENSMUSG00000102693.1                                    0
## ENSMUSG00000064842.1                                    0
## ENSMUSG00000051951.5                                    0
## ENSMUSG00000102851.1                                    0
## ENSMUSG00000103377.1                                    2
## ENSMUSG00000104017.1                                    0
##                      WT-rep3Aligned.sortedByCoord.out.bam
## ENSMUSG00000102693.1                                    0
## ENSMUSG00000064842.1                                    0
## ENSMUSG00000051951.5                                    0
## ENSMUSG00000102851.1                                    0
## ENSMUSG00000103377.1                                    0
## ENSMUSG00000104017.1                                    2
##                      cKO-rep1Aligned.sortedByCoord.out.bam
## ENSMUSG00000102693.1                                     0
## ENSMUSG00000064842.1                                     0
## ENSMUSG00000051951.5                                     0
## ENSMUSG00000102851.1                                     0
## ENSMUSG00000103377.1                                     0
## ENSMUSG00000104017.1                                     1
##                      cKO-rep2Aligned.sortedByCoord.out.bam
## ENSMUSG00000102693.1                                     0
## ENSMUSG00000064842.1                                     0
## ENSMUSG00000051951.5                                     0
## ENSMUSG00000102851.1                                     0
## ENSMUSG00000103377.1                                     0
## ENSMUSG00000104017.1                                     0
##                      cKO-rep3Aligned.sortedByCoord.out.bam
## ENSMUSG00000102693.1                                     1
## ENSMUSG00000064842.1                                     0
## ENSMUSG00000051951.5                                     0
## ENSMUSG00000102851.1                                     1
## ENSMUSG00000103377.1                                     0
## ENSMUSG00000104017.1                                     0
dim(star.counts)
## [1] 55487     6
summary(star.counts)
##  WT-rep1Aligned.sortedByCoord.out.bam WT-rep2Aligned.sortedByCoord.out.bam
##  Min.   :     0                       Min.   :     0                      
##  1st Qu.:     0                       1st Qu.:     0                      
##  Median :     1                       Median :     1                      
##  Mean   :  1009                       Mean   :   904                      
##  3rd Qu.:    86                       3rd Qu.:    78                      
##  Max.   :811870                       Max.   :470203                      
##  WT-rep3Aligned.sortedByCoord.out.bam cKO-rep1Aligned.sortedByCoord.out.bam
##  Min.   :     0.0                     Min.   :     0.0                     
##  1st Qu.:     0.0                     1st Qu.:     0.0                     
##  Median :     1.0                     Median :     1.0                     
##  Mean   :   874.5                     Mean   :   922.1                     
##  3rd Qu.:    77.0                     3rd Qu.:    82.0                     
##  Max.   :388953.0                     Max.   :459595.0                     
##  cKO-rep2Aligned.sortedByCoord.out.bam cKO-rep3Aligned.sortedByCoord.out.bam
##  Min.   :     0.0                      Min.   :     0.0                     
##  1st Qu.:     0.0                      1st Qu.:     0.0                     
##  Median :     1.0                      Median :     1.0                     
##  Mean   :   933.4                      Mean   :   795.3                     
##  3rd Qu.:    81.0                      3rd Qu.:    70.0                     
##  Max.   :412338.0                      Max.   :390466.0

Importing HISAT2 counts

# Extract counts by running featureCounts() 
hisat2.counts <- fcounts.fn(metadata$HISAT2_path)
## 
##         ==========     _____ _    _ ____  _____  ______          _____  
##         =====         / ____| |  | |  _ \|  __ \|  ____|   /\   |  __ \ 
##           =====      | (___ | |  | | |_) | |__) | |__     /  \  | |  | |
##             ====      \___ \| |  | |  _ <|  _  /|  __|   / /\ \ | |  | |
##               ====    ____) | |__| | |_) | | \ \| |____ / ____ \| |__| |
##         ==========   |_____/ \____/|____/|_|  \_\______/_/    \_\_____/
##        Rsubread 2.4.0
## 
## //========================== featureCounts setting ===========================\\
## ||                                                                            ||
## ||             Input files : 6 BAM files                                      ||
## ||                                                                            ||
## ||                           WT-rep1.sorted.bam                               ||
## ||                           WT-rep2.sorted.bam                               ||
## ||                           WT-rep3.sorted.bam                               ||
## ||                           cKO-rep1.sorted.bam                              ||
## ||                           cKO-rep2.sorted.bam                              ||
## ||                           cKO-rep3.sorted.bam                              ||
## ||                                                                            ||
## ||              Paired-end : no                                               ||
## ||        Count read pairs : no                                               ||
## ||              Annotation : gencode.m25.primary_assembly.annotation.gtf  ... ||
## ||      Dir for temp files : .                                                ||
## ||                 Threads : 16                                               ||
## ||                   Level : meta-feature level                               ||
## ||      Multimapping reads : counted                                          ||
## || Multi-overlapping reads : not counted                                      ||
## ||   Min overlapping bases : 1                                                ||
## ||                                                                            ||
## \\============================================================================//
## 
## //================================= Running ==================================\\
## ||                                                                            ||
## || Load annotation file gencode.m25.primary_assembly.annotation.gtf ...       ||
## ||    Features : 843712                                                       ||
## ||    Meta-features : 55487                                                   ||
## ||    Chromosomes/contigs : 45                                                ||
## ||                                                                            ||
## || Process BAM file WT-rep1.sorted.bam...                                     ||
## ||                                                                            ||
## || Chromosomes/contigs in input file but not in annotation                    ||
## ||    GL456359.1                                                              ||
## ||    GL456393.1                                                              ||
## ||    GL456389.1                                                              ||
## ||    JH584300.1                                                              ||
## ||    GL456368.1                                                              ||
## ||    GL456394.1                                                              ||
## ||    GL456360.1                                                              ||
## ||    JH584301.1                                                              ||
## ||    GL456390.1                                                              ||
## ||    GL456213.1                                                              ||
## ||    GL456382.1                                                              ||
## ||    GL456378.1                                                              ||
## ||    JH584302.1                                                              ||
## ||    GL456387.1                                                              ||
## ||    GL456370.1                                                              ||
## ||    GL456366.1                                                              ||
## ||    GL456383.1                                                              ||
## ||    GL456379.1                                                              ||
## ||    GL456396.1                                                              ||
## ||    GL456392.1                                                              ||
## ||    GL456367.1                                                              ||
## ||                                                                            ||
## ||    Single-end reads are included.                                          ||
## ||    Total alignments : 63363619                                             ||
## ||    Successfully assigned alignments : 48324485 (76.3%)                     ||
## ||    Running time : 0.06 minutes                                             ||
## ||                                                                            ||
## || Process BAM file WT-rep2.sorted.bam...                                     ||
## ||                                                                            ||
## || Chromosomes/contigs in input file but not in annotation                    ||
## ||    GL456359.1                                                              ||
## ||    GL456393.1                                                              ||
## ||    GL456389.1                                                              ||
## ||    JH584300.1                                                              ||
## ||    GL456368.1                                                              ||
## ||    GL456394.1                                                              ||
## ||    GL456360.1                                                              ||
## ||    JH584301.1                                                              ||
## ||    GL456390.1                                                              ||
## ||    GL456213.1                                                              ||
## ||    GL456382.1                                                              ||
## ||    GL456378.1                                                              ||
## ||    JH584302.1                                                              ||
## ||    GL456387.1                                                              ||
## ||    GL456370.1                                                              ||
## ||    GL456366.1                                                              ||
## ||    GL456383.1                                                              ||
## ||    GL456379.1                                                              ||
## ||    GL456396.1                                                              ||
## ||    GL456392.1                                                              ||
## ||    GL456367.1                                                              ||
## ||                                                                            ||
## ||    Single-end reads are included.                                          ||
## ||    Total alignments : 56580014                                             ||
## ||    Successfully assigned alignments : 43217333 (76.4%)                     ||
## ||    Running time : 0.06 minutes                                             ||
## ||                                                                            ||
## || Process BAM file WT-rep3.sorted.bam...                                     ||
## ||                                                                            ||
## || Chromosomes/contigs in input file but not in annotation                    ||
## ||    GL456359.1                                                              ||
## ||    GL456393.1                                                              ||
## ||    GL456389.1                                                              ||
## ||    JH584300.1                                                              ||
## ||    GL456368.1                                                              ||
## ||    GL456394.1                                                              ||
## ||    GL456360.1                                                              ||
## ||    JH584301.1                                                              ||
## ||    GL456390.1                                                              ||
## ||    GL456213.1                                                              ||
## ||    GL456382.1                                                              ||
## ||    GL456378.1                                                              ||
## ||    JH584302.1                                                              ||
## ||    GL456387.1                                                              ||
## ||    GL456370.1                                                              ||
## ||    GL456366.1                                                              ||
## ||    GL456383.1                                                              ||
## ||    GL456379.1                                                              ||
## ||    GL456396.1                                                              ||
## ||    GL456392.1                                                              ||
## ||    GL456367.1                                                              ||
## ||                                                                            ||
## ||    Single-end reads are included.                                          ||
## ||    Total alignments : 54969075                                             ||
## ||    Successfully assigned alignments : 41849987 (76.1%)                     ||
## ||    Running time : 0.06 minutes                                             ||
## ||                                                                            ||
## || Process BAM file cKO-rep1.sorted.bam...                                    ||
## ||                                                                            ||
## || Chromosomes/contigs in input file but not in annotation                    ||
## ||    GL456359.1                                                              ||
## ||    GL456393.1                                                              ||
## ||    GL456389.1                                                              ||
## ||    JH584300.1                                                              ||
## ||    GL456368.1                                                              ||
## ||    GL456394.1                                                              ||
## ||    GL456360.1                                                              ||
## ||    JH584301.1                                                              ||
## ||    GL456390.1                                                              ||
## ||    GL456213.1                                                              ||
## ||    GL456382.1                                                              ||
## ||    GL456378.1                                                              ||
## ||    JH584302.1                                                              ||
## ||    GL456387.1                                                              ||
## ||    GL456370.1                                                              ||
## ||    GL456366.1                                                              ||
## ||    GL456383.1                                                              ||
## ||    GL456379.1                                                              ||
## ||    GL456396.1                                                              ||
## ||    GL456392.1                                                              ||
## ||    GL456367.1                                                              ||
## ||                                                                            ||
## ||    Single-end reads are included.                                          ||
## ||    Total alignments : 57670244                                             ||
## ||    Successfully assigned alignments : 44111169 (76.5%)                     ||
## ||    Running time : 0.06 minutes                                             ||
## ||                                                                            ||
## || Process BAM file cKO-rep2.sorted.bam...                                    ||
## ||                                                                            ||
## || Chromosomes/contigs in input file but not in annotation                    ||
## ||    GL456359.1                                                              ||
## ||    GL456393.1                                                              ||
## ||    GL456389.1                                                              ||
## ||    JH584300.1                                                              ||
## ||    GL456368.1                                                              ||
## ||    GL456394.1                                                              ||
## ||    GL456360.1                                                              ||
## ||    JH584301.1                                                              ||
## ||    GL456390.1                                                              ||
## ||    GL456213.1                                                              ||
## ||    GL456382.1                                                              ||
## ||    GL456378.1                                                              ||
## ||    JH584302.1                                                              ||
## ||    GL456387.1                                                              ||
## ||    GL456370.1                                                              ||
## ||    GL456366.1                                                              ||
## ||    GL456383.1                                                              ||
## ||    GL456379.1                                                              ||
## ||    GL456396.1                                                              ||
## ||    GL456392.1                                                              ||
## ||    GL456367.1                                                              ||
## ||                                                                            ||
## ||    Single-end reads are included.                                          ||
## ||    Total alignments : 58054098                                             ||
## ||    Successfully assigned alignments : 44514481 (76.7%)                     ||
## ||    Running time : 0.06 minutes                                             ||
## ||                                                                            ||
## || Process BAM file cKO-rep3.sorted.bam...                                    ||
## ||                                                                            ||
## || Chromosomes/contigs in input file but not in annotation                    ||
## ||    GL456359.1                                                              ||
## ||    GL456393.1                                                              ||
## ||    GL456389.1                                                              ||
## ||    JH584300.1                                                              ||
## ||    GL456368.1                                                              ||
## ||    GL456394.1                                                              ||
## ||    GL456360.1                                                              ||
## ||    JH584301.1                                                              ||
## ||    GL456390.1                                                              ||
## ||    GL456213.1                                                              ||
## ||    GL456382.1                                                              ||
## ||    GL456378.1                                                              ||
## ||    JH584302.1                                                              ||
## ||    GL456387.1                                                              ||
## ||    GL456370.1                                                              ||
## ||    GL456366.1                                                              ||
## ||    GL456383.1                                                              ||
## ||    GL456379.1                                                              ||
## ||    GL456396.1                                                              ||
## ||    GL456392.1                                                              ||
## ||    GL456367.1                                                              ||
## ||                                                                            ||
## ||    Single-end reads are included.                                          ||
## ||    Total alignments : 50205194                                             ||
## ||    Successfully assigned alignments : 38062143 (75.8%)                     ||
## ||    Running time : 0.05 minutes                                             ||
## ||                                                                            ||
## || Write the final count table.                                               ||
## || Write the read assignment summary.                                         ||
## ||                                                                            ||
## \\============================================================================//
# Explore the HISAT2 count data frame
head(hisat2.counts)
##                      WT-rep1.sorted.bam WT-rep2.sorted.bam WT-rep3.sorted.bam
## ENSMUSG00000102693.1                  0                  0                  0
## ENSMUSG00000064842.1                  0                  0                  0
## ENSMUSG00000051951.5                  0                  0                  0
## ENSMUSG00000102851.1                  0                  0                  0
## ENSMUSG00000103377.1                  1                  2                  0
## ENSMUSG00000104017.1                  0                  0                  2
##                      cKO-rep1.sorted.bam cKO-rep2.sorted.bam
## ENSMUSG00000102693.1                   0                   0
## ENSMUSG00000064842.1                   0                   0
## ENSMUSG00000051951.5                   0                   0
## ENSMUSG00000102851.1                   0                   0
## ENSMUSG00000103377.1                   0                   0
## ENSMUSG00000104017.1                   1                   0
##                      cKO-rep3.sorted.bam
## ENSMUSG00000102693.1                   1
## ENSMUSG00000064842.1                   0
## ENSMUSG00000051951.5                   0
## ENSMUSG00000102851.1                   1
## ENSMUSG00000103377.1                   0
## ENSMUSG00000104017.1                   0
dim(hisat2.counts)
## [1] 55487     6
summary(hisat2.counts)
##  WT-rep1.sorted.bam WT-rep2.sorted.bam WT-rep3.sorted.bam cKO-rep1.sorted.bam
##  Min.   :     0.0   Min.   :     0.0   Min.   :     0.0   Min.   :     0.0   
##  1st Qu.:     0.0   1st Qu.:     0.0   1st Qu.:     0.0   1st Qu.:     0.0   
##  Median :     1.0   Median :     1.0   Median :     1.0   Median :     1.0   
##  Mean   :   870.9   Mean   :   778.9   Mean   :   754.2   Mean   :   795.0   
##  3rd Qu.:    85.0   3rd Qu.:    77.0   3rd Qu.:    76.0   3rd Qu.:    81.5   
##  Max.   :755391.0   Max.   :436946.0   Max.   :361243.0   Max.   :428302.0   
##  cKO-rep2.sorted.bam cKO-rep3.sorted.bam
##  Min.   :     0.0    Min.   :     0     
##  1st Qu.:     0.0    1st Qu.:     0     
##  Median :     1.0    Median :     1     
##  Mean   :   802.3    Mean   :   686     
##  3rd Qu.:    81.0    3rd Qu.:    69     
##  Max.   :383887.0    Max.   :363272

Data cleaning: sample and gene annotation

countList <- list(salmon.counts, 
                  star.counts,
                  hisat2.counts)

# Assign names of the count data frames in the count list
names(countList) <- Aligners

# Set a function cleaning the count data frame
clean.fn <- function(df) {

    # Convert to a data frame
    df <- as.data.frame(df)

    # Assign column names
    names(df) <- SampleNames

    # Bring row names to a column
    df <- df %>% rownames_to_column(var="GENEID")

    return(df)
}


# Set a function to drop GENEID version
clean.annotation.fn <- function(df) {

    # Re-annotate without version specification
    df <- separate(df, "GENEID", c("GENEID", "Version"))

    # Remove version column
    df <- df[, colnames(df) != "Version"]

    return(df)
}

# Move GENEID to a column
for (x in Aligners) {

    countList[[x]] <- clean.fn(countList[[x]])

}


# Remove version of GENEID and duplicated rows in STAR & HISAT2 count tables
for (x in Aligners) {

    countList[[x]] <- clean.annotation.fn(countList[[x]]) %>% 

        distinct()

}



# Explore the cleaned count data frames 
head(countList[[1]])
##               GENEID WT-rep1  WT-rep2 WT-rep3 cKO-rep1 cKO-rep2 cKO-rep3
## 1 ENSMUSG00000000001    6492 5822.000    5614 6040.000 6142.000 5437.000
## 2 ENSMUSG00000000056    1637 1458.000    1395 1579.000 1608.999 1398.000
## 3 ENSMUSG00000000058       5    5.000       4    3.000    1.000    3.000
## 4 ENSMUSG00000000078    3344 2807.000    2828 3589.000 3731.000 2912.000
## 5 ENSMUSG00000000088    4558 4149.929    3958 4189.921 4151.000 3623.928
## 6 ENSMUSG00000000093      10    8.000       8    7.000    4.000    4.000
head(countList[[2]])
##               GENEID WT-rep1 WT-rep2 WT-rep3 cKO-rep1 cKO-rep2 cKO-rep3
## 1 ENSMUSG00000102693       0       0       0        0        0        1
## 2 ENSMUSG00000064842       0       0       0        0        0        0
## 3 ENSMUSG00000051951       0       0       0        0        0        0
## 4 ENSMUSG00000102851       0       0       0        0        0        1
## 5 ENSMUSG00000103377       1       2       0        0        0        0
## 6 ENSMUSG00000104017       0       0       2        1        0        0
head(countList[[3]])
##               GENEID WT-rep1 WT-rep2 WT-rep3 cKO-rep1 cKO-rep2 cKO-rep3
## 1 ENSMUSG00000102693       0       0       0        0        0        1
## 2 ENSMUSG00000064842       0       0       0        0        0        0
## 3 ENSMUSG00000051951       0       0       0        0        0        0
## 4 ENSMUSG00000102851       0       0       0        0        0        1
## 5 ENSMUSG00000103377       1       2       0        0        0        0
## 6 ENSMUSG00000104017       0       0       2        1        0        0
dim(countList[[1]])
## [1] 15022     7
dim(countList[[2]])
## [1] 55487     7
dim(countList[[3]])
## [1] 55487     7
sum(duplicated(countList[[1]]))
## [1] 0
sum(duplicated(countList[[2]]))
## [1] 0
sum(duplicated(countList[[3]]))
## [1] 0
# Convert Salmon counts to integers 
countList[["Salmon"]] <- cbind(GENEID=countList[["Salmon"]][, "GENEID"],
                               round(countList[["Salmon"]][, 
                               colnames(countList[["Salmon"]]) %in% SampleNames]))

# Explore the cleaned count data frames 
head(countList[[1]])
##               GENEID WT-rep1 WT-rep2 WT-rep3 cKO-rep1 cKO-rep2 cKO-rep3
## 1 ENSMUSG00000000001    6492    5822    5614     6040     6142     5437
## 2 ENSMUSG00000000056    1637    1458    1395     1579     1609     1398
## 3 ENSMUSG00000000058       5       5       4        3        1        3
## 4 ENSMUSG00000000078    3344    2807    2828     3589     3731     2912
## 5 ENSMUSG00000000088    4558    4150    3958     4190     4151     3624
## 6 ENSMUSG00000000093      10       8       8        7        4        4

Plotting sequencing depth

Number of total counts per sample

# Set a function generating a data frame with sequencing depth
seq.depth.fn <- function(df, aligner) {

    seqdf <- as.data.frame(colSums(df[, SampleNames])) %>% 
        rownames_to_column (var="Sample") %>% 
        mutate(Aligner=aligner)

    names(seqdf) <- c("Sample", "Count", "Aligner")

    return(seqdf)
}

# Set a function for a bar plot comparing values
comparing.barplot.fn <- function(df, yval, title, ytitle) {

    ggplot(df, 
       aes(x=Sample, y=yval, group=Aligner, fill=Aligner)) +
    geom_bar(stat="identity", position="dodge") +
    theme_bw() +
    ggtitle(title) + 
    ylab(ytitle)

}




# Initialize the seq depth data frame with the first aligner
seq.depth.df <- seq.depth.fn(countList[[1]], Aligners[1])

# Extend the seq depth data frame with the rest of aligners
for (x in Aligners) {

    if (x %in% Aligners[2:length(Aligners)]) {

        seq.depth.df <- rbind(seq.depth.df, 
                              seq.depth.fn(countList[[x]], x))
    }
}

# Explore how the data frame 
print(seq.depth.df)
##      Sample    Count Aligner
## 1   WT-rep1 10139658  Salmon
## 2   WT-rep2  9071265  Salmon
## 3   WT-rep3  8800358  Salmon
## 4  cKO-rep1  9219918  Salmon
## 5  cKO-rep2  9291690  Salmon
## 6  cKO-rep3  7989400  Salmon
## 7   WT-rep1 55979059    STAR
## 8   WT-rep2 50159175    STAR
## 9   WT-rep3 48522991    STAR
## 10 cKO-rep1 51163980    STAR
## 11 cKO-rep2 51790758    STAR
## 12 cKO-rep3 44128747    STAR
## 13  WT-rep1 48324485  HISAT2
## 14  WT-rep2 43217333  HISAT2
## 15  WT-rep3 41849987  HISAT2
## 16 cKO-rep1 44111169  HISAT2
## 17 cKO-rep2 44514481  HISAT2
## 18 cKO-rep3 38062143  HISAT2
summary(seq.depth.df)
##     Sample              Count            Aligner         
##  Length:18          Min.   : 7989400   Length:18         
##  Class :character   1st Qu.: 9503682   Class :character  
##  Mode  :character   Median :43664251   Mode  :character  
##                     Mean   :34240922                     
##                     3rd Qu.:48473364                     
##                     Max.   :55979059
# Convert character vectors to factors
seq.depth.df$Sample <- factor(seq.depth.df$Sample, 
                              levels=SampleNames)
seq.depth.df$Aligner <- factor(seq.depth.df$Aligner, 
                               levels=Aligners)

# Create a plot presenting sequencing depth
comparing.barplot.fn(seq.depth.df, 
                     seq.depth.df$Count, 
                     "Sequencing Depth by Sample and Aligner", 
                     "Count")

Generating DESeq2 objects

- vst() was run to perform variance stabilizing transformation instead of rlog() which takes longer time with similar characteristics.

- The vsd object created by vst() is used for not DE analysis but QC.

- References: DESeq2 doc “Transcript abundance files”, DESeq2 doc “Variance stabilizing transformation”

# Initialize new lists for storing dds objects
ddsList <- countList

# Initialize new lists for storing vsd objects
vsdList <- countList


for (x in Aligners) {

    # Create a count matrix from the count data frame 
    m <- countList[[x]][, colnames(countList[[x]]) != "GENEID"] %>% 
        as.matrix()

    # Assigne row names
    rownames(m) <- countList[[x]]$GENEID

    # Generate a DESeq2 object
    ddsList[[x]] <- DESeqDataSetFromMatrix(m, 
                                           colData=metadata, 
                                           design=~Group) 

    # Conduct vst
    vsdList[[x]] <- varianceStabilizingTransformation(ddsList[[x]], 
                                                      blind=TRUE) 
}

# Explore generated objects
summary(ddsList)
##        Length Class        Mode
## Salmon 15022  DESeqDataSet S4  
## STAR   55487  DESeqDataSet S4  
## HISAT2 55487  DESeqDataSet S4
summary(vsdList)
##        Length Class          Mode
## Salmon 15022  DESeqTransform S4  
## STAR   55487  DESeqTransform S4  
## HISAT2 55487  DESeqTransform S4

Estimating size factors

- black dashed line: size factor = 1

- Reference: DESeq2 doc “Sample-/gene-dependent normalization factors”

# Calculate and add size factors to the DEseq object
for (x in Aligners) {

    ddsList[[x]] <- estimateSizeFactors(ddsList[[x]])

}

# Set a function summarizing size factors by aligner to a data frame
sfactor.fn <- function(df, aligner) {

    sizefactor <- as.data.frame(round(sizeFactors(df), 3)) %>%
        rownames_to_column(var="Sample") %>%
        mutate(Aligner=aligner)

    names(sizefactor) <- c("Sample", "Size_Factor", "Aligner")

    return(sizefactor)

}

# Initialize a data frame with the first aligner 
size.factor.df <- sfactor.fn(ddsList[[1]], Aligners[1])


for (x in Aligners) {

    if (x != Aligners[1]) {

        size.factor.df <- rbind(size.factor.df, 
                                sfactor.fn(ddsList[[x]], x))
    }
}


# Explore the data frame
print(size.factor.df)
##      Sample Size_Factor Aligner
## 1   WT-rep1       1.115  Salmon
## 2   WT-rep2       1.000  Salmon
## 3   WT-rep3       0.975  Salmon
## 4  cKO-rep1       1.022  Salmon
## 5  cKO-rep2       1.040  Salmon
## 6  cKO-rep3       0.890  Salmon
## 7   WT-rep1       1.107    STAR
## 8   WT-rep2       1.000    STAR
## 9   WT-rep3       0.977    STAR
## 10 cKO-rep1       1.019    STAR
## 11 cKO-rep2       1.038    STAR
## 12 cKO-rep3       0.891    STAR
## 13  WT-rep1       1.107  HISAT2
## 14  WT-rep2       1.000  HISAT2
## 15  WT-rep3       0.976  HISAT2
## 16 cKO-rep1       1.019  HISAT2
## 17 cKO-rep2       1.038  HISAT2
## 18 cKO-rep3       0.891  HISAT2
# Convert character vectors to factors
size.factor.df$Sample <- factor(size.factor.df$Sample, 
                              levels=SampleNames)
size.factor.df$Aligner <- factor(size.factor.df$Aligner, 
                               levels=Aligners)

# Plot calculated size factors
comparing.barplot.fn(size.factor.df, 
                     size.factor.df$Size_Factor,  
                     "Size Factors by Aligner and Sample", 
                     "Size Factor") + geom_hline(yintercept=1, linetype="dashed", color="black", size=1)

Estimating dispersion and Wald test

- Dispersion is calculated as a measure of variation instead of variance since variance gets larger when gene expression gets higher.

- Wald test is the default setting of DESeq2 which tests null hypothesis between two groups. You should use Likelihood ratio test (LRT) when comparing more than two groups.

- References: Harvard Chan Bioinformatics Core workshop I, Harvard Chan Bioinformatics Core workshop II, Harvard Chan Bioinformatics Core workshop III, DESeq2 "Wald test indivisual steps, DESeq2 doc “Likelihood ratio test”

for (x in Aligners) {

    # Dispersion
    ddsList[[x]] <- estimateDispersions(ddsList[[x]])
    
    # Wald test
    ddsList[[x]] <- nbinomWaldTest(ddsList[[x]])

}


# Explore generated data in the dds object 
ddsList[[1]]
## class: DESeqDataSet 
## dim: 15022 6 
## metadata(1): version
## assays(4): counts mu H cooks
## rownames(15022): ENSMUSG00000000001 ENSMUSG00000000056 ...
##   ENSMUSG00000118638 ENSMUSG00000118653
## rowData names(22): baseMean baseVar ... deviance maxCooks
## colnames(6): WT-rep1 WT-rep2 ... cKO-rep2 cKO-rep3
## colData names(6): Sample Group ... HISAT2_path sizeFactor

Sample QC: Principal Component Analysis (PCA)

Identifies source of variation and sample outliers

- Reference: DESeq2 doc “Principal component plot of the samples”, DESeq2 doc “Heatmap of the sample-to-sample distances”

# Assigne what to compare
GroupOfInterest <- Contrast[1]


# Set a function for sample pca
qcpca.fn <- function(obj, title) {

    plotPCA(obj,
        intgroup=GroupOfInterest,
        returnData=FALSE) + theme_bw() + ggtitle(paste("PCA:", title)) 

}

# Print the plots
qcpca.fn(vsdList[[1]], Aligners[1]) 

qcpca.fn(vsdList[[2]], Aligners[2])

qcpca.fn(vsdList[[3]], Aligners[3]) 

Sample QC: Sample Correlation Heatmap

Identifies distance between samples & correlation in a group

# Heatmap annotation
HeatmapAnno <- metadata[, c("Sample", "Group")]

# Set a function generating a correlation heatmap
cheatmap.fn <- function(df, title) {

    # Extract a normalized count matrix
    vm <- assay(df)

    # Generate a correlation matrix
    cm <- cor(vm)

    # Generate a heatmap
    pheatmap(cm, 
             annotation=HeatmapAnno, 
             main=paste("Sample Correlation Heatmap:", title))
}


# Print the heatmaps
cheatmap.fn(vsdList[[1]], Aligners[1])

cheatmap.fn(vsdList[[2]], Aligners[2])

cheatmap.fn(vsdList[[3]], Aligners[3])

Running DE analysis

# Run DESeq 
for (x in Aligners) {

    ddsList[[x]] <- DESeq(ddsList[[x]])
    # Check result names 
    ResNames <- resultsNames(ddsList[[x]])
    print(ResNames)

}
## [1] "Intercept"       "Group_cKO_vs_WT"
## [1] "Intercept"       "Group_cKO_vs_WT"
## [1] "Intercept"       "Group_cKO_vs_WT"

Creating dispersion plots

- Dispersion is important since estimation by DESeq2 algorithm is based on the assumption that genes with similar expression levels have similar dispersion. If an RNA-seq dataset doesn’t satisfy this assumption, use other DE algorithms than DESeq2.

- References: DESeq2 doc "Dispersion plot and fitting alternatives, Harvard Chan Bioinformatics Core workshop

# Set a function plotting dispersion
dplot.fn <- function(dds, title) {

    plotDispEsts(dds, 
                 main=paste("Dispersion over Counts:", title))
}

# Plot dispersion patterns
dplot.fn(ddsList[[1]], Aligners[1])

dplot.fn(ddsList[[2]], Aligners[2])

dplot.fn(ddsList[[3]], Aligners[3])

# Do they fit well with the DESeq2 estimation model?

Setting how to extract fold-change results

Change variables below

- The alpha denotes threshold of false discovery rate (FDR) assigned by users.

- In this analysis, the alpha is set to 0.1

# Set FDR threshold alpha
alpha=0.1

# Set the coefficients to compare 
Coef <- ResNames[-1]
print(Coef) 
## [1] "Group_cKO_vs_WT"
# Set a function to clean a result table 
lfctable.fn <- function(df) {
    df <- df %>% 
        rownames_to_column(var="GENEID") %>%
        mutate(FDR=ifelse(padj < 0.1 & !is.na(padj), 
                                   "< 0.1", 
                                   "> 0.1")) 
    return(df)
}

# Set a function extracting results
extract.lfc.fn <- function(dds) {

    res <- results(dds, contrast=Contrast, alpha=alpha)
    lfctable.fn(as.data.frame(res))

    return(lfctable.fn(as.data.frame(res)))


}

Extracting log2FoldChanges

You can change alpha depending on your interest of FDR level

Shrinkage is NOT applied in this analysis

# Initialize a list storing lfc data frames
lfcList <- countList

# Extract DE results
# The Contrast variable was defined in the previous chunk
# Extraction with no shrinkage
# alpha: FDR threshold
for (x in Aligners) {

    lfcList[[x]] <- extract.lfc.fn(ddsList[[x]]) %>% mutate(Alignment=x)

    print(head(lfcList[[x]]))

}
##               GENEID    baseMean log2FoldChange      lfcSE       stat
## 1 ENSMUSG00000000001 5887.836970    -0.04209378 0.05507976 -0.7642332
## 2 ENSMUSG00000000056 1503.299379    -0.09721473 0.07106900 -1.3678922
## 3 ENSMUSG00000000058    3.475901     0.91645280 1.11870668  0.8192074
## 4 ENSMUSG00000000078 3179.585578    -0.25212003 0.07128383 -3.5368476
## 5 ENSMUSG00000000088 4076.735801     0.01631577 0.05743487  0.2840742
## 6 ENSMUSG00000000093    6.727450     0.72813952 0.79661877  0.9140376
##         pvalue       padj   FDR Alignment
## 1 0.4447282643 0.84560461 > 0.1    Salmon
## 2 0.1713458110 0.58843581 > 0.1    Salmon
## 3 0.4126680866         NA > 0.1    Salmon
## 4 0.0004049332 0.01145908 < 0.1    Salmon
## 5 0.7763535171 0.95547829 > 0.1    Salmon
## 6 0.3606970840         NA > 0.1    Salmon
##               GENEID  baseMean log2FoldChange    lfcSE       stat    pvalue
## 1 ENSMUSG00000102693 0.1870490     -1.0278145 4.080473 -0.2518861 0.8011291
## 2 ENSMUSG00000064842 0.0000000             NA       NA         NA        NA
## 3 ENSMUSG00000051951 0.0000000             NA       NA         NA        NA
## 4 ENSMUSG00000102851 0.1870490     -1.0278145 4.080473 -0.2518861 0.8011291
## 5 ENSMUSG00000103377 0.4837499      2.3680696 3.152915  0.7510730 0.4526087
## 6 ENSMUSG00000104017 0.5047301      0.8868952 3.201764  0.2770020 0.7817786
##   padj   FDR Alignment
## 1   NA > 0.1      STAR
## 2   NA > 0.1      STAR
## 3   NA > 0.1      STAR
## 4   NA > 0.1      STAR
## 5   NA > 0.1      STAR
## 6   NA > 0.1      STAR
##               GENEID  baseMean log2FoldChange    lfcSE       stat    pvalue
## 1 ENSMUSG00000102693 0.1870355     -1.0280029 4.080473 -0.2519323 0.8010934
## 2 ENSMUSG00000064842 0.0000000             NA       NA         NA        NA
## 3 ENSMUSG00000051951 0.0000000             NA       NA         NA        NA
## 4 ENSMUSG00000102851 0.1870355     -1.0280029 4.080473 -0.2519323 0.8010934
## 5 ENSMUSG00000103377 0.4838032      2.3677771 3.123686  0.7580073 0.4484466
## 6 ENSMUSG00000104017 0.5050522      0.8868523 3.167217  0.2800099 0.7794699
##   padj   FDR Alignment
## 1   NA > 0.1    HISAT2
## 2   NA > 0.1    HISAT2
## 3   NA > 0.1    HISAT2
## 4   NA > 0.1    HISAT2
## 5   NA > 0.1    HISAT2
## 6   NA > 0.1    HISAT2
# Initialize a data frame storing total lfc results across the aligners
lfc.dataframe <- lfcList[[1]] 

for (x in Aligners[2:length(Aligners)]) {

    lfc.dataframe <- rbind(lfc.dataframe, 
                           lfcList[[x]])

}


lfc.dataframe$Alignment <- factor(lfc.dataframe$Alignment, 
                                  levels=Aligners)

Exploring distribution of false discovery rate (FDR)

Black dashed line: FDR = 0.1

# Plot distribution of FDR 
ggplot(lfc.dataframe, 
       aes(x=padj, y=..count.., color=Alignment)) + 
    geom_density(size=1) + 
    theme_bw() + 
    scale_x_log10() + 
    ggtitle("Distribution of False Discovery Rate (FDR) by Aligner") + 
    ylab("Count") +
    xlim(0.00001, 1) + 
    geom_vline(xintercept=alpha, 
               color="black", 
               size=1, linetype="dashed") + scale_x_continuous(breaks=seq(0, 1, by=0.1))

Presenting distribution of log2FoldChange

Black: total genes (padj =/= NA)

Colored: genes above or below FDR=0.1

valid.lfc.df <- subset(lfc.dataframe, FDR == "< 0.1")

ggplot(valid.lfc.df,
       aes(x=log2FoldChange,
           y=..count.., 
           color=Alignment)) +
geom_density(size=1) + 
theme_bw() + 
geom_vline(xintercept=c(-1, 1), 
           linetype="dashed", color="black", size=1) + 
ggtitle("Distribution of log2FoldChange Values by Aligner (FDR < 0.1)") +
ylab("Count") + 
xlim(-10, 10)    # Change xlim by datatype

Exploring mean-difference with an MA plot

- x-axis: expression level (baseMean))

- y-axis: fold change (log2FoldChange)

- Red dashed lines: log2FoldChange = -1 and 1

# Set ylim: has to adjusted by users depending on data 
yl <- c(-10, 10)

# Set min log2 fold change of interest 
mLog <- c(-1, 1)

# Create MA plots by Aligner
ggplot(lfc.dataframe, aes(x=baseMean, y=log2FoldChange, color=FDR)) + 
        geom_point() + 
        facet_grid(~Alignment) +
        scale_x_log10() + 
        theme_bw() + 
        scale_color_manual(values=c("blue", "grey")) + 
        ggtitle(paste("MA plot")) + 
        ylim(yl[1], yl[2]) + 
        theme(strip.text.x=element_text(size=10)) +
        geom_hline(yintercept=c(mLog[1], mLog[2]), 
                   linetype="dashed", color="red") 

Exploring expression profiling with normalized count data

- Normalized count matrices are extracted from dds objects and filtered with thresholds set at FDR and log2FoldChange

- The heatmaps display z-scores of the normalized counts

- In this analysis, mLog = 1

- References: Harvard Chan Bioinformatics Core workshop, DESeq2 doc “Heatmap of the count matrix”

# Initialize a list 
heatmap.df.List <- lfcList

# Filter genes with FDR < alpha and absolute log2FoldChange > 1
for (x in Aligners) {

    # Set a logical vector filtering FDR below alpha
    is.fdr.valid <- lfcList[[x]]$FDR == paste("<", alpha)

    # Set a logical vector filtering absolute lfc above 1 
    is.lfc.large <- abs(lfcList[[x]]$log2FoldChange) > mLog[2]

    # Extract total normalized counts
    norm.counts <- counts(ddsList[[x]], normalized=T)

    # Save filtered genes only from the normalized count data
    heatmap.df.List[[x]] <- norm.counts[is.fdr.valid & is.lfc.large,]

}

# Explore the cleaned data frames 
head(heatmap.df.List[[1]])
##                      WT-rep1   WT-rep2   WT-rep3  cKO-rep1  cKO-rep2  cKO-rep3
## ENSMUSG00000000125  17.04733  17.00055  12.30765  19.56256  80.76408  48.31321
## ENSMUSG00000001020  75.36716  90.00293 124.10216  33.25635  33.65170  40.44827
## ENSMUSG00000001025 493.47547 611.01986 500.51120 244.53199 184.60362 214.60052
## ENSMUSG00000002228 200.08187 307.00998 338.46044 150.63171  91.34033 142.69249
## ENSMUSG00000007682  50.24478  50.00163  31.79477  88.03152 143.26010  80.89653
## ENSMUSG00000015533  64.60043  87.00283  93.33303  49.88453  22.11398  43.81895
head(heatmap.df.List[[2]])
##                       WT-rep1    WT-rep2    WT-rep3   cKO-rep1   cKO-rep2
## ENSMUSG00000025929  151.77009   57.97408   16.37872   7.848327   7.705361
## ENSMUSG00000041872  131.89543   62.97184   16.37872  14.715613  14.447553
## ENSMUSG00000047180 1469.82104 1545.30903 1586.68816 626.885122 470.027043
## ENSMUSG00000026072   49.68664   46.97899   57.32551  10.791450   5.779021
## ENSMUSG00000026070  542.93943  570.74480  901.85308 417.923415 242.718883
## ENSMUSG00000025969   33.42556   36.98346   30.71009  60.824535  78.016784
##                      cKO-rep3
## ENSMUSG00000025929   5.611470
## ENSMUSG00000041872   8.978352
## ENSMUSG00000047180 622.873185
## ENSMUSG00000026072  14.589822
## ENSMUSG00000026070 253.638450
## ENSMUSG00000025969  65.093054
head(heatmap.df.List[[3]])
##                       WT-rep1    WT-rep2    WT-rep3   cKO-rep1   cKO-rep2
## ENSMUSG00000025929  150.82592   57.99044   16.38913   7.853379   7.708966
## ENSMUSG00000041872  127.34404   58.99027   12.29185  13.743413  15.417932
## ENSMUSG00000047180 1458.58602 1537.74640 1576.42924 623.361945 468.319699
## ENSMUSG00000026072   49.67321   45.99242   57.36195  10.798396   5.781725
## ENSMUSG00000026070  525.63286  555.90832  872.72106 409.357372 235.123470
## ENSMUSG00000025969   34.31967   37.99373   32.77826  57.918669  82.871387
##                      cKO-rep3
## ENSMUSG00000025929   5.611066
## ENSMUSG00000041872   8.977705
## ENSMUSG00000047180 617.217229
## ENSMUSG00000026072  14.588771
## ENSMUSG00000026070 246.886892
## ENSMUSG00000025969  70.699428
dim(heatmap.df.List[[1]])
## [1] 74  6
dim(heatmap.df.List[[2]])
## [1] 236   6
dim(heatmap.df.List[[3]])
## [1] 240   6
pheatmap(heatmap.df.List[[3]], 
         annotation=HeatmapAnno,
         scale="row",
         show_rownames=F)

# Set a function creating a profiling heatmap
profile.heatmap.fn <- function(df, title) {

    pheatmap(df, 
             annotation=HeatmapAnno, 
             scale="row", 
             show_rownames=F,
             main=paste("Expression Profiling by", title, "(FDR < 0.1, absolute log2FoldChange > 1)"))
}

# Print the expression heatmaps
profile.heatmap.fn(heatmap.df.List[[Aligners[1]]], Aligners[1])

profile.heatmap.fn(heatmap.df.List[[Aligners[2]]], Aligners[2])

profile.heatmap.fn(heatmap.df.List[[Aligners[3]]], Aligners[3])

NA statistics: zero count genes & outlier genes

When NAs appear in

- log2FoldChange: zero counts in all samples

- padj: too little information

- pval & padj: at least one replicate was an outlier

# Count number of NA genes  
type=c("Zero Counts", "Outliers", "Total NA Genes") 

# Create a data frame storing number of NA genes by type
NA.genes <- lfc.dataframe %>% 
    group_by(Alignment) %>% 
    summarize(zero=sum(is.na(log2FoldChange)), 
              outlier=sum(is.na(pvalue) & is.na(padj))) %>% 
    mutate(total=zero + outlier) %>%
    gather(Type, Number, -Alignment) %>% 
    mutate(Type=factor(case_when(Type == "zero" ~ type[1],
                                 Type == "outlier" ~ type[2],
                                 Type == "total" ~ type[3]),
                       levels=type))

# Plot number of NA genes 
ggplot(NA.genes, 
       aes(x=Type, y=Number, group=Alignment, fill=Alignment, label=Number)) + 
    geom_bar(stat="identity", position="dodge") + 
    theme_bw() +
    geom_text(position=position_dodge(width=1), vjust=1.5) + 
    ggtitle("Number of NA Genes") + 
    ylab("Number of Genes")

Ranking DEGs by alginer

- fdr.rank: ranked by FDR

- lfc.rank: ranked by absolute fold change

- up.lfc.rank: ranked by magnitude of fold change increase

- down.lfc.rank: ranked by manitude of fold change decrease

# Create a new list having DE table with FDR below alpha
fdr.rank <- lfcList
lfc.rank <- lfcList
up.lfc.rank <- lfcList
down.lfc.rank <- lfcList

# Set a sorting genes with FDR below alpha 
filter.fdr.fn <- function(df) {as.data.table(df[df$FDR == paste("<", alpha),])}

# Set a function creating a column assigning ranking 
Ranking.fn <- function(x) {mutate(x, Rank=1:nrow(x))}


for (x in Aligners) {

    rdf <- lfcList[[x]]

    fdr.rank[[x]] <- filter.fdr.fn(rdf) %>% arrange(padj) %>% Ranking.fn()

    lfc.rank[[x]] <- filter.fdr.fn(rdf) %>% arrange(desc(abs(log2FoldChange))) %>% Ranking.fn()

    up.lfc.rank[[x]] <- filter.fdr.fn(rdf) %>% arrange(desc(log2FoldChange)) %>% Ranking.fn() 

    down.lfc.rank[[x]] <- filter.fdr.fn(rdf) %>% arrange(log2FoldChange) %>% Ranking.fn()

}



# Explore the ranking outputs
head(fdr.rank[[1]])
##                GENEID   baseMean log2FoldChange     lfcSE     stat       pvalue
## 1: ENSMUSG00000022018   645.5261       1.348896 0.1098847 12.27556 1.225431e-34
## 2: ENSMUSG00000018008  1210.5052       1.404828 0.1188426 11.82092 3.043495e-32
## 3: ENSMUSG00000047180  1006.8911       1.439987 0.1222790 11.77625 5.174627e-32
## 4: ENSMUSG00000032011 15998.0860       1.879407 0.1687060 11.14013 8.000394e-29
## 5: ENSMUSG00000020689  2312.2504       1.536737 0.1398468 10.98872 4.330317e-28
## 6: ENSMUSG00000040498   715.1294       2.226779 0.2078944 10.71111 9.027428e-27
##            padj   FDR Alignment Rank
## 1: 5.340430e-31 < 0.1    Salmon    1
## 2: 6.631776e-29 < 0.1    Salmon    2
## 3: 7.517008e-29 < 0.1    Salmon    3
## 4: 8.716430e-26 < 0.1    Salmon    4
## 5: 3.774304e-25 < 0.1    Salmon    5
## 6: 6.556922e-24 < 0.1    Salmon    6
head(lfc.rank[[1]])
##                GENEID baseMean log2FoldChange     lfcSE      stat       pvalue
## 1: ENSMUSG00000025929 39.59839       3.516588 0.7223282  4.868407 1.125012e-06
## 2: ENSMUSG00000061769 12.35596      -3.303917 0.9208850 -3.587762 3.335282e-04
## 3: ENSMUSG00000043932 15.02868      -3.218227 0.8504086 -3.784330 1.541231e-04
## 4: ENSMUSG00000037129 21.70641       2.999024 0.6362719  4.713432 2.435792e-06
## 5: ENSMUSG00000040653 10.25877       2.971537 0.8909262  3.335335 8.519683e-04
## 6: ENSMUSG00000041872 36.40167       2.642199 0.7245405  3.646724 2.656053e-04
##            padj   FDR Alignment Rank
## 1: 0.0001114273 < 0.1    Salmon    1
## 2: 0.0096901061 < 0.1    Salmon    2
## 3: 0.0055305321 < 0.1    Salmon    3
## 4: 0.0002166363 < 0.1    Salmon    4
## 5: 0.0197493512 < 0.1    Salmon    5
## 6: 0.0083877396 < 0.1    Salmon    6
head(up.lfc.rank[[1]])
##                GENEID baseMean log2FoldChange     lfcSE     stat       pvalue
## 1: ENSMUSG00000025929 39.59839       3.516588 0.7223282 4.868407 1.125012e-06
## 2: ENSMUSG00000037129 21.70641       2.999024 0.6362719 4.713432 2.435792e-06
## 3: ENSMUSG00000040653 10.25877       2.971537 0.8909262 3.335335 8.519683e-04
## 4: ENSMUSG00000041872 36.40167       2.642199 0.7245405 3.646724 2.656053e-04
## 5: ENSMUSG00000030217 25.85230       2.506255 0.5146045 4.870255 1.114545e-06
## 6: ENSMUSG00000070368 37.05782       2.393616 0.5450880 4.391246 1.127028e-05
##            padj   FDR Alignment Rank
## 1: 0.0001114273 < 0.1    Salmon    1
## 2: 0.0002166363 < 0.1    Salmon    2
## 3: 0.0197493512 < 0.1    Salmon    3
## 4: 0.0083877396 < 0.1    Salmon    4
## 5: 0.0001114273 < 0.1    Salmon    5
## 6: 0.0007674360 < 0.1    Salmon    6
head(down.lfc.rank[[1]])
##                GENEID baseMean log2FoldChange     lfcSE      stat       pvalue
## 1: ENSMUSG00000061769 12.35596      -3.303917 0.9208850 -3.587762 3.335282e-04
## 2: ENSMUSG00000043932 15.02868      -3.218227 0.8504086 -3.784330 1.541231e-04
## 3: ENSMUSG00000082901 12.01047      -2.408667 0.7691315 -3.131671 1.738145e-03
## 4: ENSMUSG00000051726 15.39281      -2.214000 0.5950714 -3.720562 1.987797e-04
## 5: ENSMUSG00000026241 17.91325      -2.213291 0.6521051 -3.394071 6.886185e-04
## 6: ENSMUSG00000090691 34.44154      -2.059095 0.4737681 -4.346209 1.385107e-05
##            padj   FDR Alignment Rank
## 1: 0.0096901061 < 0.1    Salmon    1
## 2: 0.0055305321 < 0.1    Salmon    2
## 3: 0.0330778872 < 0.1    Salmon    3
## 4: 0.0067577830 < 0.1    Salmon    4
## 5: 0.0169547993 < 0.1    Salmon    5
## 6: 0.0009286611 < 0.1    Salmon    6

Calculating rank difference between STAR and HISAT2

Salmon was excluded due to extremely small number of genes found

# Set a function rebuilding DE tables with gene ranks 
rankdiff.fn <- function(List){

    # Select columns and join the data frames by gene
    full_join(List[[Aligners[2]]][, .(GENEID, Alignment, Rank, baseMean)], 
              List[[Aligners[3]]][, .(GENEID, Alignment, Rank, baseMean)], 
              by="GENEID") %>%
    
    # Add columns assining gene expression levels, rank differences, and mean ranks
    mutate(logMeanExpression=log(baseMean.x+baseMean.y/2),
           RankDiff=Rank.x-Rank.y, 
           MeanRank=(Rank.x+Rank.y)/2)
} 

# Calculate rank difference by ranking type
fdr.rankdiff <- rankdiff.fn(fdr.rank)
lfc.rankdiff <- rankdiff.fn(lfc.rank)
up.lfc.rankdiff <- rankdiff.fn(up.lfc.rank)
down.lfc.rankdiff <- rankdiff.fn(down.lfc.rank)

# Explore the calculated rank differences
head(fdr.rankdiff)
##                GENEID Alignment.x Rank.x baseMean.x Alignment.y Rank.y
## 1: ENSMUSG00000026463        STAR      1  1562.6504      HISAT2      1
## 2: ENSMUSG00000022018        STAR      2   686.0206      HISAT2      2
## 3: ENSMUSG00000018008        STAR      3  1295.3734      HISAT2      3
## 4: ENSMUSG00000047180        STAR      4  1053.6006      HISAT2      4
## 5: ENSMUSG00000020689        STAR      5  2430.9495      HISAT2      5
## 6: ENSMUSG00000040498        STAR      6   763.7041      HISAT2     41
##    baseMean.y logMeanExpression RankDiff MeanRank
## 1:  1537.4785          7.754220        0      1.0
## 2:   667.0280          6.927102        0      2.0
## 3:  1252.9425          7.561041        0      3.0
## 4:  1046.9434          7.363325        0      4.0
## 5:  2385.4226          8.195240        0      5.0
## 6:   743.7298          7.034889      -35     23.5
head(lfc.rankdiff)
##                GENEID Alignment.x Rank.x baseMean.x Alignment.y Rank.y
## 1: ENSMUSG00000103587        STAR      1   42.32553      HISAT2      1
## 2: ENSMUSG00000020713        STAR      2   11.98750        <NA>     NA
## 3: ENSMUSG00000017002        STAR      3   12.81535      HISAT2      2
## 4: ENSMUSG00000061769        STAR      4   14.59555      HISAT2      3
## 5: ENSMUSG00000025929        STAR      5   41.21467      HISAT2      4
## 6: ENSMUSG00000043932        STAR      6   16.59888      HISAT2      5
##    baseMean.y logMeanExpression RankDiff MeanRank
## 1:   41.48362          4.144203        0      1.0
## 2:         NA                NA       NA       NA
## 3:   12.65005          2.951800        1      2.5
## 4:   14.27835          3.078911        1      3.5
## 5:   41.06315          4.123033        1      4.5
## 6:   16.44317          3.211669        1      5.5
head(up.lfc.rankdiff)
##                GENEID Alignment.x Rank.x baseMean.x Alignment.y Rank.y
## 1: ENSMUSG00000017002        STAR      1   12.81535      HISAT2      1
## 2: ENSMUSG00000025929        STAR      2   41.21467      HISAT2      2
## 3: ENSMUSG00000017897        STAR      3  140.16560      HISAT2      5
## 4: ENSMUSG00000037129        STAR      4   22.03950      HISAT2      3
## 5: ENSMUSG00000034115        STAR      5   14.97109      HISAT2      4
## 6: ENSMUSG00000040653        STAR      6   11.92657      HISAT2      7
##    baseMean.y logMeanExpression RankDiff MeanRank
## 1:   12.65005          2.951800        0      1.0
## 2:   41.06315          4.123033        0      2.0
## 3:  132.92687          5.330925       -2      4.0
## 4:   21.69879          3.493135        1      3.5
## 5:   14.83145          3.108472        1      4.5
## 6:   10.47964          2.842953       -1      6.5
head(down.lfc.rankdiff)
##                GENEID Alignment.x Rank.x baseMean.x Alignment.y Rank.y
## 1: ENSMUSG00000103587        STAR      1   42.32553      HISAT2      1
## 2: ENSMUSG00000020713        STAR      2   11.98750        <NA>     NA
## 3: ENSMUSG00000061769        STAR      3   14.59555      HISAT2      2
## 4: ENSMUSG00000043932        STAR      4   16.59888      HISAT2      3
## 5: ENSMUSG00000058626        STAR      5   17.88988      HISAT2      5
## 6: ENSMUSG00000030178        STAR      6   21.10503      HISAT2      4
##    baseMean.y logMeanExpression RankDiff MeanRank
## 1:   41.48362          4.144203        0      1.0
## 2:         NA                NA       NA       NA
## 3:   14.27835          3.078911        1      2.5
## 4:   16.44317          3.211669        1      3.5
## 5:   17.73726          3.286853        0      5.0
## 6:   19.63052          3.431413        2      5.0

Visualizing DEG rankings

- x-axis: genes aligned by STAR

- y-axis: genes aligned by HISAT2

- Black diagonal lines: ranking in STAR = HISAT2

- Dot color: gene expression level (log-baseMean)

- 409 genes were missing in the plots

# Set a function plotting DEG rankings 
ranking.plot.fn <- function(df, rankedby) {

    ggplot(df, 
           aes(x=Rank.x, y=Rank.y, color=logMeanExpression)) + geom_point(alpha=0.5) + theme_bw() + theme(strip.text.x=element_text(size=10)) + xlab("Ranking in STAR") + ylab("Ranking in HISAT2") + geom_abline(slope=1, color="black", size=0.5) + ggtitle(paste("Gene Ranking by", rankedby)) + scale_color_gradient(low="blue", high="red") 
}

# Plot rankings by ranking type
ranking.plot.fn(fdr.rankdiff, "FDR")

ranking.plot.fn(lfc.rankdiff, "Log2FoldChange")

ranking.plot.fn(up.lfc.rankdiff, "Log2FoldChange (Increase)")

ranking.plot.fn(down.lfc.rankdiff, "Log2FoldChange (Decrease)")

Visualizing DEG ranks II: Rank difference

- x-axis: expression level (log-baseMean)

- y-axis: rank difference (STAR ranking - HISAT2 ranking)

- Black horizontal lines: ranking in STAR = HISAT2

- Dot color: average ranking

- 409 genes were missing in the plots

# Set a function plotting DEG rank difference
rankdiff.plot.fn <- function(df, rankedby, ylim) { 

    ggplot(df, aes(x=logMeanExpression, y=RankDiff, color=MeanRank)) + 
        geom_point(alpha=0.5) + 
        theme_bw() +         
        ylab("Rank Difference (STAR - HISAT2)") + 
        ggtitle(paste("Rank Difference (STAR - HISAT2)\n in", rankedby)) + 
        geom_hline(yintercept=0, color="black", size=0.5) + scale_color_gradient(low="blue", high="red") +
        ylim(-ylim, ylim)

}



# Display the plots in the same y-scale
rankdiff.plot.fn(fdr.rankdiff, "FDR", 1100)

rankdiff.plot.fn(lfc.rankdiff, "Log2FoldChange", 1100)

rankdiff.plot.fn(up.lfc.rankdiff, "Log2FoldChange (Increase)", 1100)

rankdiff.plot.fn(down.lfc.rankdiff, "Log2FoldChange (Decrease)", 1100)

# Display the plots in free y-scale
rankdiff.plot.fn(fdr.rankdiff, "FDR (free y-scale)", 1100)

rankdiff.plot.fn(lfc.rankdiff, "Log2FoldChange (free y-scale)", 500)

rankdiff.plot.fn(up.lfc.rankdiff, "Log2FoldChange (Increase, free y-scale)", 150)

rankdiff.plot.fn(down.lfc.rankdiff, "Log2FoldChange (Decrease, free y-scale)", 250)

Distribution of rank difference

# Create a list storing rankdiff data frames 
rankList <- list(fdr.rankdiff,
                 lfc.rankdiff,
                 up.lfc.rankdiff,
                 down.lfc.rankdiff)


# Assine result column as a factor with levels 
rankdiff.levels <- c("FDR", 
                     "log2FoldChange", 
                     "log2FoldChange.Increase", 
                     "log2FoldChange.Decrease")

# Name the list 
names(rankList) <- rankdiff.levels


# Create a new data frame storing rank difference by result type
rankdiff.dist <- data.frame(FDR=abs(rankList[[1]]$RankDiff), 
                            log2FoldChange=abs(rankList[[2]]$RankDiff),
                            log2FoldChange.Increase=abs(rankList[[3]]$RankDiff),
                            log2FoldChange.Decrease=abs(rankList[[4]]$RankDiff)) %>% gather(Result, RankDiff) 

rankdiff.dist$Result <- factor(rankdiff.dist$Result, levels=rankdiff.levels)

# Plot distribution of absolute rank difference
ggplot(rankdiff.dist,
       aes(x=Result, y=RankDiff, color=Result)) +
geom_jitter(alpha=0.5) + 
geom_boxplot(alpha=0.5, fill="grey", color="black") + 
theme_bw() + 
theme(axis.text.x=element_text(angle=45, hjust=1)) +
ggtitle("Distribution of Absolute Rank Difference without Shrinkage \n (STAR - HISAT2)") + 
ylab("Absolute Rank Difference (STAR - HISAT2)") 

Relationship between rank difference and number of transcript versions

- y-axis: abs(TPM-Count inputs)

- x-axis: number of transcripts (number of transcript id / gene id)

- dot color: mean rank

# Create a data frame storing the number of transcripts by gene id
AnnoDb.ntrans <- AnnoDb %>% 
    group_by(ENSEMBL) %>% 
    summarize(num.trans=n())

# Set a function adding the number of transcripts by gene id 
add.ntrans.fn <- function(df) {

    left_join(df, AnnoDb.ntrans, by=c("GENEID"="ENSEMBL"))}




# Add a column indicating the number of transcripts by gene id to every rankdiff data frame
for (x in rankdiff.levels) {

    rankList[[x]] <- add.ntrans.fn(rankList[[x]])
}


# Explore the edited data frames
summary(rankList)
##                         Length Class      Mode
## FDR                     11     data.table list
## log2FoldChange          11     data.table list
## log2FoldChange.Increase 11     data.table list
## log2FoldChange.Decrease 11     data.table list
head(rankList[[1]])
##                GENEID Alignment.x Rank.x baseMean.x Alignment.y Rank.y
## 1: ENSMUSG00000026463        STAR      1  1562.6504      HISAT2      1
## 2: ENSMUSG00000022018        STAR      2   686.0206      HISAT2      2
## 3: ENSMUSG00000018008        STAR      3  1295.3734      HISAT2      3
## 4: ENSMUSG00000047180        STAR      4  1053.6006      HISAT2      4
## 5: ENSMUSG00000020689        STAR      5  2430.9495      HISAT2      5
## 6: ENSMUSG00000040498        STAR      6   763.7041      HISAT2     41
##    baseMean.y logMeanExpression RankDiff MeanRank num.trans
## 1:  1537.4785          7.754220        0      1.0        NA
## 2:   667.0280          6.927102        0      2.0         1
## 3:  1252.9425          7.561041        0      3.0        13
## 4:  1046.9434          7.363325        0      4.0         2
## 5:  2385.4226          8.195240        0      5.0         2
## 6:   743.7298          7.034889      -35     23.5         4
head(rankList[[2]])
##                GENEID Alignment.x Rank.x baseMean.x Alignment.y Rank.y
## 1: ENSMUSG00000103587        STAR      1   42.32553      HISAT2      1
## 2: ENSMUSG00000020713        STAR      2   11.98750        <NA>     NA
## 3: ENSMUSG00000017002        STAR      3   12.81535      HISAT2      2
## 4: ENSMUSG00000061769        STAR      4   14.59555      HISAT2      3
## 5: ENSMUSG00000025929        STAR      5   41.21467      HISAT2      4
## 6: ENSMUSG00000043932        STAR      6   16.59888      HISAT2      5
##    baseMean.y logMeanExpression RankDiff MeanRank num.trans
## 1:   41.48362          4.144203        0      1.0        NA
## 2:         NA                NA       NA       NA         1
## 3:   12.65005          2.951800        1      2.5        NA
## 4:   14.27835          3.078911        1      3.5         1
## 5:   41.06315          4.123033        1      4.5         1
## 6:   16.44317          3.211669        1      5.5         1
head(rankList[[3]])
##                GENEID Alignment.x Rank.x baseMean.x Alignment.y Rank.y
## 1: ENSMUSG00000017002        STAR      1   12.81535      HISAT2      1
## 2: ENSMUSG00000025929        STAR      2   41.21467      HISAT2      2
## 3: ENSMUSG00000017897        STAR      3  140.16560      HISAT2      5
## 4: ENSMUSG00000037129        STAR      4   22.03950      HISAT2      3
## 5: ENSMUSG00000034115        STAR      5   14.97109      HISAT2      4
## 6: ENSMUSG00000040653        STAR      6   11.92657      HISAT2      7
##    baseMean.y logMeanExpression RankDiff MeanRank num.trans
## 1:   12.65005          2.951800        0      1.0        NA
## 2:   41.06315          4.123033        0      2.0         1
## 3:  132.92687          5.330925       -2      4.0        NA
## 4:   21.69879          3.493135        1      3.5         1
## 5:   14.83145          3.108472        1      4.5        NA
## 6:   10.47964          2.842953       -1      6.5         2
head(rankList[[4]])
##                GENEID Alignment.x Rank.x baseMean.x Alignment.y Rank.y
## 1: ENSMUSG00000103587        STAR      1   42.32553      HISAT2      1
## 2: ENSMUSG00000020713        STAR      2   11.98750        <NA>     NA
## 3: ENSMUSG00000061769        STAR      3   14.59555      HISAT2      2
## 4: ENSMUSG00000043932        STAR      4   16.59888      HISAT2      3
## 5: ENSMUSG00000058626        STAR      5   17.88988      HISAT2      5
## 6: ENSMUSG00000030178        STAR      6   21.10503      HISAT2      4
##    baseMean.y logMeanExpression RankDiff MeanRank num.trans
## 1:   41.48362          4.144203        0      1.0        NA
## 2:         NA                NA       NA       NA         1
## 3:   14.27835          3.078911        1      2.5         1
## 4:   16.44317          3.211669        1      3.5         1
## 5:   17.73726          3.286853        0      5.0        NA
## 6:   19.63052          3.431413        2      5.0        NA
# Set a function plotting rank difference vs number of transcripts 
rank.ntrans.plot.fn <- function(df, title) {

    ggplot(df, aes(x=num.trans, y=abs(RankDiff), color=MeanRank)) + 
        geom_jitter(alpha=0.5) + 
        theme_bw() + 
        ggtitle(paste("Rank Difference vs Number of Alternative Transcripts \n in", title)) + 
        xlab("Number of Alternative Transcripts") + 
        ylab("Absolute Rank Difference \n (STAR - HISAT2)") + scale_color_gradient(low="blue", high="red") 
}

# Print the plots
rank.ntrans.plot.fn(rankList[[1]], "FDR")

rank.ntrans.plot.fn(rankList[[2]], "log2FoldChange")

rank.ntrans.plot.fn(rankList[[3]], "log2FoldChange (Increase)")

rank.ntrans.plot.fn(rankList[[4]], "log2FoldChange (Decrease)")

Finding genes having large difference in rankings

# Initialize a list storing rankdiff genes 
large.rankdiff <- rankList

# Assign a vector storing minimum (thresholds) rankdiff for filtering large rankdiff genes
rankdiff.thr <- c(10, 10, 10, 10)

names(rankdiff.thr) <- rankdiff.levels

for (x in rankdiff.levels) {

    # Filter out observations below the rankdiff thresholds
    large.rankdiff[[x]] <- subset(rankList[[x]], 
                                      abs(RankDiff) > rankdiff.thr[x]) 

}

# Explore the filtered genes 
summary(large.rankdiff)
##                         Length Class      Mode
## FDR                     11     data.table list
## log2FoldChange          11     data.table list
## log2FoldChange.Increase 11     data.table list
## log2FoldChange.Decrease 11     data.table list
dim(large.rankdiff[[rankdiff.levels[1]]])
## [1] 1086   11
dim(large.rankdiff[[rankdiff.levels[2]]])
## [1] 847  11
dim(large.rankdiff[[rankdiff.levels[3]]])
## [1] 727  11
dim(large.rankdiff[[rankdiff.levels[4]]])
## [1] 1124   11

Summarizing up/down DEGs with an upset plot

red bar: aligner

blue bar: directionality of gene expression change

# Generate a data frame storing upset input variables
upset.dataframe <- subset(lfc.dataframe, !is.na(padj)) %>%
    
    mutate(Up=ifelse(FDR == paste("<", alpha) & log2FoldChange > 0, GENEID, ""), 

           Down=ifelse(FDR == paste("<", alpha) & log2FoldChange < 0, GENEID, ""),

           Unchanged=ifelse(FDR == paste(">", alpha), GENEID, ""),

           Salmon=ifelse(Alignment == "Salmon", GENEID, ""), 

           STAR=ifelse(Alignment == "STAR", GENEID, ""),

           HISAT2=ifelse(Alignment == "HISAT2", GENEID, ""))

# Generate a list input 
upset.input <- list(Up=upset.dataframe$Up, 
                    Down=upset.dataframe$Down,
                    Unchanged=upset.dataframe$Unchanged, 
                    Salmon=upset.dataframe$Salmon,
                    STAR=upset.dataframe$STAR,
                    HISAT2=upset.dataframe$HISAT2)

# Create an upset plot
upset(fromList(upset.input), 
      sets=names(upset.input),   # What group to display 
      sets.x.label="Number of Genes per Group",
      order.by="freq",
      point.size=3,
      sets.bar.color=c("red", "red", "blue", "red", "blue", "blue"),
      text.scale = 1.5, number.angles=30)

Session Info

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.1 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /home/mira/miniconda3/envs/r/lib/libopenblasp-r0.3.10.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] AnnotationDbi_1.52.0        UpSetR_1.4.0               
##  [3] DESeq2_1.30.0               SummarizedExperiment_1.20.0
##  [5] Biobase_2.50.0              MatrixGenerics_1.2.0       
##  [7] matrixStats_0.57.0          GenomicRanges_1.42.0       
##  [9] GenomeInfoDb_1.26.2         IRanges_2.24.0             
## [11] S4Vectors_0.28.1            Rsubread_2.4.0             
## [13] tximport_1.18.0             AnnotationHub_2.22.0       
## [15] BiocFileCache_1.14.0        dbplyr_2.0.0               
## [17] BiocGenerics_0.36.0         pheatmap_1.0.12            
## [19] rmarkdown_2.5               forcats_0.5.0              
## [21] stringr_1.4.0               dplyr_1.0.2                
## [23] purrr_0.3.4                 readr_1.4.0                
## [25] tidyr_1.1.2                 tibble_3.0.4               
## [27] ggplot2_3.3.2               tidyverse_1.3.0            
## [29] data.table_1.13.4          
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_2.0-0              ellipsis_0.3.1               
##  [3] XVector_0.30.0                fs_1.5.0                     
##  [5] rstudioapi_0.13               farver_2.0.3                 
##  [7] bit64_4.0.5                   interactiveDisplayBase_1.28.0
##  [9] fansi_0.4.1                   lubridate_1.7.9.2            
## [11] xml2_1.3.2                    splines_4.0.3                
## [13] geneplotter_1.68.0            knitr_1.30                   
## [15] jsonlite_1.7.2                broom_0.7.2                  
## [17] annotate_1.68.0               shiny_1.5.0                  
## [19] BiocManager_1.30.10           compiler_4.0.3               
## [21] httr_1.4.2                    backports_1.2.1              
## [23] assertthat_0.2.1              Matrix_1.2-18                
## [25] fastmap_1.0.1                 cli_2.2.0                    
## [27] later_1.1.0.1                 htmltools_0.5.0              
## [29] tools_4.0.3                   gtable_0.3.0                 
## [31] glue_1.4.2                    GenomeInfoDbData_1.2.4       
## [33] rappdirs_0.3.1                Rcpp_1.0.5                   
## [35] cellranger_1.1.0              vctrs_0.3.5                  
## [37] xfun_0.19                     ps_1.5.0                     
## [39] rvest_0.3.6                   mime_0.9                     
## [41] lifecycle_0.2.0               XML_3.99-0.5                 
## [43] zlibbioc_1.36.0               scales_1.1.1                 
## [45] hms_0.5.3                     promises_1.1.1               
## [47] RColorBrewer_1.1-2            yaml_2.2.1                   
## [49] curl_4.3                      memoise_1.1.0                
## [51] gridExtra_2.3                 stringi_1.5.3                
## [53] RSQLite_2.2.1                 BiocVersion_3.12.0           
## [55] genefilter_1.72.0             BiocParallel_1.24.1          
## [57] rlang_0.4.9                   pkgconfig_2.0.3              
## [59] bitops_1.0-6                  evaluate_0.14                
## [61] lattice_0.20-41               labeling_0.4.2               
## [63] bit_4.0.4                     tidyselect_1.1.0             
## [65] plyr_1.8.6                    magrittr_2.0.1               
## [67] R6_2.5.0                      generics_0.1.0               
## [69] DelayedArray_0.16.0           DBI_1.1.0                    
## [71] pillar_1.4.7                  haven_2.3.1                  
## [73] withr_2.3.0                   survival_3.2-7               
## [75] RCurl_1.98-1.2                modelr_0.1.8                 
## [77] crayon_1.3.4                  locfit_1.5-9.4               
## [79] grid_4.0.3                    readxl_1.3.1                 
## [81] blob_1.2.1                    reprex_0.3.0                 
## [83] digest_0.6.27                 xtable_1.8-4                 
## [85] httpuv_1.5.4                  munsell_0.5.0